Load required packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.20.4). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## 
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
## 
##     ar
library(synthpop)
## Find out more at https://www.synthpop.org.uk/
library(corrplot)
## corrplot 0.92 loaded
library(loo)
## This is loo version 2.6.0
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
## 
## Attaching package: 'loo'
## The following object is masked from 'package:synthpop':
## 
##     compare
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:brms':
## 
##     rwiener
library(bayesplot)
## This is bayesplot version 1.10.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
##    * Does _not_ affect other ggplot2 plots
##    * See ?bayesplot_theme_set for details on theme setting
## 
## Attaching package: 'bayesplot'
## The following object is masked from 'package:brms':
## 
##     rhat
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Load original dataset

# Read the original cirrhosis dataset (418 patients) from the working directory.
og_data<- read.csv("cirrhosis.csv")
head(og_data)
ABCDEFGHIJ0123456789
 
 
ID
<int>
N_Days
<int>
Status
<chr>
Drug
<chr>
Age
<int>
Sex
<chr>
Ascites
<chr>
Hepatomegaly
<chr>
Spiders
<chr>
11400DD-penicillamine21464FYYY
224500CD-penicillamine20617FNYY
331012DD-penicillamine25594MNNN
441925DD-penicillamine19994FNYY
551504CLPlacebo13918FNYY
662503DPlacebo24201FNYN

Synthetic data generation

# Fixed seed so the synthetic sample is reproducible.
myseed<-1285
# Generate k = 5000 synthetic rows from the original data using CART models,
# synthesising every column in order (visit.sequence = 1..ncol), one replicate (m = 1).
synthetic_data<-syn(og_data, method = "cart", visit.sequence = (1:ncol(og_data)), m=1, k=5000,seed = myseed)
## Sample(s) of size 5000 will be generated from original data of size 418.
## 
## 
## Variable(s): Status, Drug, Sex, Ascites, Hepatomegaly, Spiders, Edema have been changed for synthesis from character to factor.
## Warning: In your synthesis there are numeric variables with 5 or fewer levels: Stage.
## Consider changing them to factors. You can do it using parameter 'minnumlevels'.
## 
## Synthesis
## -----------
##  ID N_Days Status Drug Age Sex Ascites Hepatomegaly Spiders Edema
##  Bilirubin Cholesterol Albumin Copper Alk_Phos SGOT Tryglicerides Platelets Prothrombin Stage
# Extract the synthetic sample as a plain data frame.
# NOTE(review): the variable name is misspelled ("synthethic"); kept as-is
# because later code refers to this exact name.
synthethic_data<-as.data.frame(synthetic_data$syn)
head(synthethic_data)
ABCDEFGHIJ0123456789
 
 
ID
<int>
N_Days
<dbl>
Status
<chr>
Drug
<chr>
Age
<dbl>
Sex
<chr>
Ascites
<chr>
Hepatomegaly
<chr>
Spiders
<chr>
1395193DNA17897FNANANA
2842540DD-penicillamine19544FNNN
3146762DPlacebo24803MNYN
4117515DD-penicillamine19577MNYN
53961328CNA22111FNANANA
61702713CPlacebo17809FYYN

Synthetic data evaluation

# Build a per-column summary table for a data frame: the number of unique
# values plus, for numeric columns, mean/skewness/kurtosis, and for
# non-numeric columns, the most frequent category.
#
# @param data A data frame.
# @return A data.frame with one row per column of `data` and columns
#   Features, Unique_Values, Mean, Skewness, Kurtosis, Max_Category.
create_unique_values_table <- function(data) {
  rows <- lapply(colnames(data), function(col) {
    x <- data[[col]]
    if (is.numeric(x)) {
      data.frame(
        Features = col,
        Unique_Values = length(unique(x)),
        Mean = mean(x, na.rm = TRUE),
        Skewness = skewness(x, na.rm = TRUE),  # e1071::skewness
        Kurtosis = kurtosis(x, na.rm = TRUE),  # e1071::kurtosis
        Max_Category = NA_character_,
        stringsAsFactors = FALSE
      )
    } else {
      data.frame(
        Features = col,
        Unique_Values = length(unique(x)),
        Mean = NA_real_,
        Skewness = NA_real_,
        Kurtosis = NA_real_,
        # Most frequent level of the categorical column.
        Max_Category = names(sort(table(x), decreasing = TRUE))[1],
        stringsAsFactors = FALSE
      )
    }
  })
  # Bind once instead of growing the data frame row-by-row inside the loop,
  # which copies the whole accumulator on every iteration (O(n^2)).
  do.call(rbind, rows)
}

# Summarise both datasets with the same routine.
original_table <- create_unique_values_table(og_data)
synthetic_table <- create_unique_values_table(synthethic_data)

# Place original and synthetic statistics side by side, one row per feature.
# (The columns are taken straight from the two summary tables; the previous
# version copied every column into a throwaway intermediate variable first.)
comparison_df <- data.frame(
  Feature = original_table$Features,
  Original_Unique_Values = original_table$Unique_Values,
  Synthetic_Unique_Values = synthetic_table$Unique_Values,
  Original_Mean = original_table$Mean,
  Synthetic_Mean = synthetic_table$Mean,
  Original_Skewness = original_table$Skewness,
  Synthetic_Skewness = synthetic_table$Skewness,
  Original_Kurtosis = original_table$Kurtosis,
  Synthetic_Kurtosis = synthetic_table$Kurtosis,
  Original_Max_Category = original_table$Max_Category,
  Synthetic_Max_Category = synthetic_table$Max_Category
)

print(comparison_df)
##          Feature Original_Unique_Values Synthetic_Unique_Values Original_Mean
## 1             ID                    418                     418    209.500000
## 2         N_Days                    399                     399   1917.782297
## 3         Status                      3                       3            NA
## 4           Drug                      3                       3            NA
## 5            Age                    344                     344  18533.351675
## 6            Sex                      2                       2            NA
## 7        Ascites                      3                       3            NA
## 8   Hepatomegaly                      3                       3            NA
## 9        Spiders                      3                       3            NA
## 10         Edema                      3                       3            NA
## 11     Bilirubin                     98                      98      3.220813
## 12   Cholesterol                    202                     202    369.510563
## 13       Albumin                    154                     154      3.497440
## 14        Copper                    159                     159     97.648387
## 15      Alk_Phos                    296                     296   1982.655769
## 16          SGOT                    180                     180    122.556346
## 17 Tryglicerides                    147                     147    124.702128
## 18     Platelets                    244                     244    257.024570
## 19   Prothrombin                     49                      49     10.731731
## 20         Stage                      5                       5      3.024272
##    Synthetic_Mean Original_Skewness Synthetic_Skewness Original_Kurtosis
## 1      208.732800        0.00000000        0.009417929        -1.2086158
## 2     1890.130400        0.46921558        0.511656340        -0.5027025
## 3              NA                NA                 NA                NA
## 4              NA                NA                 NA                NA
## 5    18623.209200        0.08622782        0.114668567        -0.6350537
## 6              NA                NA                 NA                NA
## 7              NA                NA                 NA                NA
## 8              NA                NA                 NA                NA
## 9              NA                NA                 NA                NA
## 10             NA                NA                 NA                NA
## 11       3.409260        2.69813743        2.572163421         7.9025102
## 12     393.721673        3.37260482        3.025709201        13.9456623
## 13       3.489242       -0.46417641       -0.488826445         0.5287235
## 14      97.788425        2.28139465        2.420272874         7.4147980
## 15    2042.974659        2.96411855        2.834845920         9.4092979
## 16     125.905795        1.43529211        1.503921484         4.1777801
## 17     130.383963        2.49711592        2.440358524        11.4701457
## 18     257.449539        0.62248299        0.617158166         0.8189369
## 19      10.765287        2.20726861        2.260816276         9.8441333
## 20       3.066775       -0.49266562       -0.534760694        -0.6565785
##    Synthetic_Kurtosis Original_Max_Category Synthetic_Max_Category
## 1          -1.1988904                  <NA>                   <NA>
## 2          -0.4402123                  <NA>                   <NA>
## 3                  NA                     C                      C
## 4                  NA       D-penicillamine        D-penicillamine
## 5          -0.6610101                  <NA>                   <NA>
## 6                  NA                     F                      F
## 7                  NA                     N                      N
## 8                  NA                     Y                      Y
## 9                  NA                     N                      N
## 10                 NA                     N                      N
## 11          6.9864052                  <NA>                   <NA>
## 12         10.2915316                  <NA>                   <NA>
## 13          0.5657029                  <NA>                   <NA>
## 14          8.6058074                  <NA>                   <NA>
## 15          8.5054136                  <NA>                   <NA>
## 16          4.4141960                  <NA>                   <NA>
## 17         10.9591418                  <NA>                   <NA>
## 18          0.9213549                  <NA>                   <NA>
## 19          9.3532839                  <NA>                   <NA>
## 20         -0.5788484                  <NA>                   <NA>

Now we will treat the synthetic data as the main dataset and try to gain insights about its features; then we will perform a train/test split and evaluate the model on the test data.

Now we have to pre-process the data (one-hot encoding of categorical variables and some new feature generation)

# Encode the outcome as numeric: D = 1, C = 0, CL = 2.
# (Classes 0 and 2 are merged into one class later in the script.)
og_data <- og_data %>%
  mutate(Status = recode(Status, "D" = 1, "C" = 0, "CL" = 2),
         Status = as.numeric(Status))
# One-hot / binary encoding of the categorical variables.
# Convert age from days to years.
og_data$Age <- og_data$Age /365.25
og_data$Sex <- as.integer(og_data$Sex == "M")
# Drug: 1 = D-penicillamine, 0 = Placebo; missing Drug stays NA.
og_data$Drug <- as.integer(og_data$Drug == "D-penicillamine")
og_data$Edema <- as.integer(og_data$Edema == 'Y')
# Drop the row identifier; it carries no predictive information.
og_data <- subset(og_data, select = -ID)
# Sort by treatment arm (order() places NA Drug rows last).
og_data <- og_data[order(og_data$Drug), ]
og_data$Ascites <- as.integer(og_data$Ascites == 'Y')
og_data$Spiders <- as.integer(og_data$Spiders == 'Y')
og_data$Hepatomegaly <- as.integer(og_data$Hepatomegaly == 'Y')
head(og_data)
ABCDEFGHIJ0123456789
 
 
N_Days
<int>
Status
<dbl>
Drug
<int>
Age
<dbl>
Sex
<int>
Ascites
<int>
Hepatomegaly
<int>
Spiders
<int>
Edema
<int>
515042038.1054100110
625031066.2587300100
718320055.5345700100
824661053.0568100000
10511070.5598901011
1137621053.7138900110
# Feature generation: derive clinical indicator and interaction features.
# Drop rows with any missing values before deriving features.
og_data<-na.omit(og_data)
og_data$Age <- round(og_data$Age)
# Platelet count below 150 flags thrombocytopenia.
threshold_platelets <- 150
og_data$thrombocytopenia <- ifelse(og_data$Platelets < threshold_platelets, 1, 0)
threshold_alk_phos_upper <- 147  # Upper limit of normal range
threshold_alk_phos_lower <- 44   # Lower limit of normal range
# 1 when alkaline phosphatase falls outside the normal range (either side).
og_data$elevated_alk_phos <- ifelse(og_data$Alk_Phos > threshold_alk_phos_upper | og_data$Alk_Phos < threshold_alk_phos_lower, 1, 0)
normal_copper_range <- c(62, 140)
og_data$normal_copper <- ifelse(og_data$Copper >= normal_copper_range[1] & og_data$Copper <= normal_copper_range[2], 1, 0)
normal_albumin_range <- c(3.4, 5.4)
og_data$normal_albumin <- ifelse(og_data$Albumin >= normal_albumin_range[1] & og_data$Albumin <= normal_albumin_range[2], 1, 0)
# NOTE(review): Age is in years here while N_Days is still in days, so this
# difference mixes units — confirm the intended definition.
og_data$DiagnosisDays <- og_data$Age - og_data$N_Days
# Bin age into 4 ordered groups; ages outside (19, 99] would become NA.
og_data$Age_Group <- cut(og_data$Age, breaks = c(19, 29, 49, 64, 99), labels = c(0, 1, 2, 3))
og_data$Age_Group <- as.integer(as.character(og_data$Age_Group))
# Count of the three binary clinical signs present (0-3).
og_data$Symptom_Score <- rowSums(og_data[, c("Ascites", "Hepatomegaly", "Spiders")])
# NOTE(review): unweighted mean of variables on very different scales —
# Alk_Phos dominates this index; consider scaling before averaging.
og_data$Liver_Function_Index <- rowMeans(og_data[, c("Bilirubin", "Albumin", "Alk_Phos", "SGOT")])
og_data$Bilirubin_Albumin<-og_data$Bilirubin * og_data$Albumin
og_data$Risk_Score <- og_data$Bilirubin + og_data$Albumin - og_data$Alk_Phos
# Approximate month within the follow-up year, and whole years of follow-up.
og_data$Diag_Month <- as.integer((og_data$N_Days %% 365) / 30)
og_data$Diag_Year <- as.integer(og_data$N_Days / 365)
# Convert follow-up time from days to years.
og_data$N_Days <- og_data$N_Days / 365.25
head(og_data)
ABCDEFGHIJ0123456789
 
 
N_Days
<dbl>
Status
<dbl>
Drug
<int>
Age
<dbl>
Sex
<int>
Ascites
<int>
Hepatomegaly
<int>
Spiders
<int>
Edema
<int>
54.1177276203800110
75.0157426005600100
86.7515400105300000
100.1396304107101011
1110.2997947105400110
120.8323066105900010

Exploratory analysis (distribution of each feature by class)

library(ggplot2)
# Numeric features whose per-class distributions we want to inspect.
numeric_cols <- c('N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
                  'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
                  'Risk_Score', 'Liver_Function_Index', 'DiagnosisDays', 'Bilirubin_Albumin',
                  'Diag_Year', 'Diag_Month')

train_to_scale <- og_data[, numeric_cols]
# Very light fill palette, one colour per Status class.
ultra_light_colors <- c("#F0F8FF", "#F6F6F6", "#F0FFF0", "#FAFAD2", "#FFE4E1", 
                        "#FFF5EE", "#F5FFFA", "#F0FFFF", "#FFFAF0", "#F8F8FF")

# Violin + boxplot of one numeric column, split by outcome class, with a
# diamond marker at each class mean.
col_per_class <- function(og_data, col) {
  class_axis <- factor(og_data$Status)
  base_plot <- ggplot(og_data, aes(x = class_axis, y = .data[[col]], fill = class_axis))
  base_plot +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.1, outlier.shape = NA, fill = "white", color = "black") +
    stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "black") +
    labs(title = paste("Distribution for", col, "for each class")) +
    scale_fill_manual(values = ultra_light_colors) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
}
# One plot per numeric feature; auto-printing the list renders them all.
plots <- lapply(numeric_cols, function(feature_name) col_per_class(og_data, feature_name))
plots
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

According to the similarity of their distributions, classes C and CL are close enough, so we merge C = 0 and CL = 2 into a single class 0, leaving two categories to predict.

# Collapse the three status codes into two classes: 1 stays "1";
# 0 and 2 both become "0". The result is a character vector, as before.
og_data <- og_data %>%
  mutate(Status = if_else(Status == 1, "1", "0"))

Standardization (to put all features on the same scale)

# Treat low-cardinality columns (< 5 distinct values) as categorical factors;
# standardise everything else to mean 0 / sd 1.
threshold <- 5
group_variables <- names(og_data)[vapply(og_data, function(x) length(unique(x)), integer(1)) < threshold]
og_data[group_variables] <- lapply(og_data[group_variables], factor)
# BUG FIX: as.numeric() applied directly to a factor returns the internal
# level codes (1/2), not the original 0/1 labels; convert via character to
# recover the intended 0/1 numeric values.
og_data$elevated_alk_phos<- as.numeric(as.character(og_data$elevated_alk_phos))
non_group_variables <- setdiff(colnames(og_data), group_variables)
og_data[non_group_variables] <- scale(og_data[non_group_variables])

Check the correlation matrix and exclude highly correlated variables to reduce the tendency to overfit

# Pairwise correlations among the continuous (scaled) variables;
# pairwise-complete observations guard against any remaining NAs.
correlation_matrix <- cor(og_data[non_group_variables], use = "pairwise.complete.obs")
# Upper-triangle heat map, variables ordered by hierarchical clustering.
corrplot(correlation_matrix, method = "color", type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

Columns to drop due to correlation

# Remove engineered features that are highly correlated with existing ones
# (identified from the correlation heat map) to curb overfitting.
columns_to_drop <- c('DiagnosisDays', 'Bilirubin_Albumin','Diag_Year', 'Risk_Score', 'Liver_Function_Index')
og_data <- og_data[, setdiff(names(og_data), columns_to_drop)]

Our final data set

head(og_data)
ABCDEFGHIJ0123456789
 
 
N_Days
<dbl>
Status
<fct>
Drug
<fct>
Age
<dbl>
Sex
<fct>
Ascites
<fct>
Hepatomegaly
<fct>
Spiders
<fct>
Edema
<fct>
5-0.427162100-1.119921400110
7-0.1322989000.588853000100
80.4376501100.304057300000
10-1.7333700102.012831701011
111.6027193100.398989200110
12-1.5059298100.873648800010

Train and Test split of final dataset

# Reproducible 70/30 split, stratified on the outcome via caret.
set.seed(12348)
index <- createDataPartition(og_data$Status, p = 0.7, list = FALSE)
train_data <- og_data[index, ]
test_data <- og_data[-index, ]

Fit a Bayesian logistic regression model with all features and group-level effects (mega model)

# Bayesian logistic regression on all remaining features, with random
# intercepts for Age_Group, Stage and Symptom_Score.
# Priors: heavy-tailed student_t(4, 1, 2) on all slopes, plus an informative
# normal(-0.5, 0.25) prior on the treatment coefficient (Drug1), encoding a
# mildly protective expectation for D-penicillamine.
# adapt_delta raised to 0.98 to suppress divergent transitions.
Model1 <- brm(Status ~ . + (1|Age_Group) + (1|Stage) + (1| Symptom_Score),
            data = train_data, 
            family = bernoulli(link = "logit"),
            prior = c(set_prior("student_t(4, 1, 2)", class = "b"), 
                      set_prior("normal(-0.5,0.25)", coef = "Drug1", class = "b")),
            control = list(adapt_delta = 0.98),
            iter = 4000, warmup = 2000)
## Compiling Stan program...
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.00011 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.1 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 4.471 seconds (Warm-up)
## Chain 1:                4.032 seconds (Sampling)
## Chain 1:                8.503 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 2.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.22 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 2: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 2: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 4.418 seconds (Warm-up)
## Chain 2:                4.001 seconds (Sampling)
## Chain 2:                8.419 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 3: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 3: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 4.663 seconds (Warm-up)
## Chain 3:                4.24 seconds (Sampling)
## Chain 3:                8.903 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 4: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 4: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 5.1 seconds (Warm-up)
## Chain 4:                3.964 seconds (Sampling)
## Chain 4:                9.064 seconds (Total)
## Chain 4:
summary(Model1)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month + (1 | Age_Group) + (1 | Stage) + (1 | Symptom_Score) 
##    Data: train_data (Number of observations: 194) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~Age_Group (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.42      1.20     0.06     4.50 1.00     5711     4428
## 
## ~Stage (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.36      1.16     0.05     4.20 1.00     3866     4034
## 
## ~Symptom_Score (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.43      1.27     0.05     4.73 1.00     4241     4535
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -4.93      3.64   -12.23     2.22 1.00     8723     4973
## N_Days               -0.81      0.33    -1.46    -0.18 1.00     9938     6111
## Drug1                -0.22      0.23    -0.66     0.22 1.00    15755     5817
## Age                   0.21      0.53    -0.85     1.26 1.00    11149     5334
## Sex1                  0.77      0.77    -0.77     2.29 1.00     9251     6515
## Ascites1              1.04      1.64    -1.93     4.51 1.00     9567     5005
## Hepatomegaly1        -1.34      1.16    -3.64     0.87 1.00     6097     5342
## Spiders1              0.72      1.09    -1.41     2.89 1.00     6137     5343
## Edema1                6.31      3.33     1.92    14.52 1.00     6435     3403
## Bilirubin             1.45      0.66     0.23     2.80 1.00     9770     6607
## Cholesterol           0.06      0.30    -0.52     0.64 1.00    10602     6060
## Albumin               0.10      0.46    -0.81     1.01 1.00     7187     5962
## Copper               -0.12      0.38    -0.89     0.62 1.00     8726     6369
## Alk_Phos              1.05      0.34     0.44     1.77 1.00     9376     5452
## SGOT                  0.60      0.28     0.06     1.19 1.00    11463     5987
## Tryglicerides         0.48      0.33    -0.15     1.13 1.00    10733     6516
## Platelets             0.28      0.33    -0.36     0.93 1.00     9218     6580
## Prothrombin           0.90      0.30     0.33     1.51 1.00    11474     6101
## Stage2                2.02      1.55    -1.14     5.00 1.00     6775     5793
## Stage3                1.52      1.47    -1.39     4.48 1.00     7926     5350
## Stage4                1.57      1.48    -1.52     4.63 1.00     8572     5744
## thrombocytopenia1     1.17      0.95    -0.70     3.05 1.00     9944     6291
## elevated_alk_phos     0.99      2.67    -4.57     6.46 1.00    11119     3454
## normal_copper1       -0.51      0.58    -1.69     0.62 1.00    10143     6274
## normal_albumin1       0.22      0.76    -1.25     1.71 1.00     8744     7106
## Age_Group1            0.17      1.80    -3.38     3.77 1.00     8481     5564
## Age_Group2            2.09      1.70    -1.24     5.52 1.00     7476     5683
## Age_Group3            0.94      1.74    -2.54     4.42 1.00     8465     5457
## Symptom_Score1        1.56      1.44    -1.36     4.46 1.00     7046     4962
## Symptom_Score2        1.67      1.74    -1.76     5.31 1.00     6926     4956
## Symptom_Score3       -1.10      2.45    -6.66     2.99 1.00     8142     4905
## Diag_Month            0.86      0.31     0.27     1.48 1.00    11407     6400
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Now the same model with a different prior (prior sensitivity analysis)

# Prior sensitivity check: identical model structure to Model1, but with a
# single weakly informative student_t(3, 0, 2) prior on every slope and no
# treatment-specific prior on Drug1.
Model2 <- brm(Status ~ . + (1|Age_Group) + (1|Stage) + (1| Symptom_Score),
            data = train_data, 
            family = bernoulli(link = "logit"),
            prior = set_prior("student_t(3, 0, 2)", class = "b"),
            control = list(adapt_delta = 0.98),
            iter = 4000, warmup = 2000)
## Compiling Stan program...
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 7.9e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.79 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 4.248 seconds (Warm-up)
## Chain 1:                3.598 seconds (Sampling)
## Chain 1:                7.846 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 1.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.18 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 2: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 2: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 4.303 seconds (Warm-up)
## Chain 2:                3.548 seconds (Sampling)
## Chain 2:                7.851 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 1.8e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.18 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 3: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 3: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 4.211 seconds (Warm-up)
## Chain 3:                3.427 seconds (Sampling)
## Chain 3:                7.638 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 1.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 4000 [  0%]  (Warmup)
## Chain 4: Iteration:  400 / 4000 [ 10%]  (Warmup)
## Chain 4: Iteration:  800 / 4000 [ 20%]  (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%]  (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%]  (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%]  (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%]  (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%]  (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%]  (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 4.007 seconds (Warm-up)
## Chain 4:                3.457 seconds (Sampling)
## Chain 4:                7.464 seconds (Total)
## Chain 4:
summary(Model2)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month + (1 | Age_Group) + (1 | Stage) + (1 | Symptom_Score) 
##    Data: train_data (Number of observations: 194) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~Age_Group (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.40      1.25     0.05     4.75 1.00     5856     3933
## 
## ~Stage (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.66      1.30     0.07     4.87 1.00     4737     4757
## 
## ~Symptom_Score (Number of levels: 4) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.59      1.37     0.06     5.05 1.00     4933     4718
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -2.50      4.10   -10.23     5.69 1.00     8612     4175
## N_Days               -0.75      0.31    -1.37    -0.16 1.00    11042     6428
## Drug1                 0.98      0.53    -0.03     2.03 1.00    14578     6722
## Age                   0.18      0.52    -0.88     1.21 1.00    12365     6010
## Sex1                  0.58      0.77    -0.93     2.10 1.00    12135     6080
## Ascites1              0.60      1.70    -2.46     4.29 1.00    11254     4101
## Hepatomegaly1        -1.39      1.19    -3.97     0.76 1.00     7182     5432
## Spiders1              0.57      1.13    -1.69     2.81 1.00     6946     5916
## Edema1                8.38      5.33     2.34    22.08 1.00     5404     3068
## Bilirubin             1.59      0.67     0.36     2.98 1.00    10435     6451
## Cholesterol          -0.04      0.31    -0.64     0.56 1.00    11779     6373
## Albumin               0.05      0.45    -0.83     0.94 1.00     9349     6462
## Copper               -0.14      0.36    -0.88     0.55 1.00     8914     5716
## Alk_Phos              1.04      0.33     0.44     1.70 1.00    11028     6359
## SGOT                  0.65      0.27     0.14     1.20 1.00    12885     6592
## Tryglicerides         0.51      0.33    -0.12     1.18 1.00    11876     6555
## Platelets             0.22      0.34    -0.45     0.89 1.00    10483     5984
## Prothrombin           0.94      0.30     0.37     1.55 1.00    11327     6435
## Stage2                1.19      1.73    -2.35     4.72 1.00     7744     5250
## Stage3                0.80      1.63    -2.56     3.99 1.00     8857     5341
## Stage4                0.78      1.67    -2.50     4.19 1.00     8867     5773
## thrombocytopenia1     0.66      0.96    -1.17     2.57 1.00    11219     5930
## elevated_alk_phos    -0.01      3.19    -6.26     6.21 1.00    10300     3079
## normal_copper1       -0.69      0.58    -1.86     0.45 1.00    12884     6525
## normal_albumin1       0.20      0.75    -1.24     1.68 1.00    10464     5231
## Age_Group1           -0.66      1.71    -3.99     2.79 1.00     9703     5904
## Age_Group2            1.30      1.76    -2.06     4.95 1.00     8105     4334
## Age_Group3           -0.03      1.74    -3.60     3.50 1.00     9575     4871
## Symptom_Score1        1.06      1.64    -2.34     4.31 1.00     7625     4672
## Symptom_Score2        1.25      1.92    -2.31     5.38 1.00     7674     5309
## Symptom_Score3       -1.79      2.57    -7.81     2.50 1.00     8442     4482
## Diag_Month            0.90      0.31     0.31     1.53 1.00    11720     6297
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Now fit a different model (Model3) on the synthetic data, omitting the group-level (random-intercept) effects but keeping the first prior specification.

# Model3: fixed-effects-only Bayesian logistic regression (no group-level
# terms), refit with the first prior specification:
#   - student_t(4, 1, 2) on all population-level coefficients (class "b")
#   - an informative normal(-0.5, 0.25) prior on the Drug1 coefficient
# NOTE(review): this call relies on the brms defaults (4 chains, iter = 2000,
# warmup = 1000), so it yields 4000 post-warmup draws versus the 8000 of
# Model1/Model2 — confirm this asymmetry is intended before comparing.
Model3 <- brm(Status ~ ., 
            data = train_data, 
            family = bernoulli(link = "logit"),
            prior = c(set_prior("student_t(4, 1, 2)", class = "b"), 
                      set_prior("normal(-0.5, 0.25)", coef = "Drug1", class = "b")))
## Compiling Stan program...
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 4.8e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.48 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.205 seconds (Warm-up)
## Chain 1:                0.19 seconds (Sampling)
## Chain 1:                0.395 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 9e-06 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.212 seconds (Warm-up)
## Chain 2:                0.192 seconds (Sampling)
## Chain 2:                0.404 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 9e-06 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.21 seconds (Warm-up)
## Chain 3:                0.21 seconds (Sampling)
## Chain 3:                0.42 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.211 seconds (Warm-up)
## Chain 4:                0.185 seconds (Sampling)
## Chain 4:                0.396 seconds (Total)
## Chain 4:
# Posterior summary of Model3: the fixed-effects-only refit under the first
# prior specification.
summary(Model3)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: Status ~ N_Days + Drug + Age + Sex + Ascites + Hepatomegaly + Spiders + Edema + Bilirubin + Cholesterol + Albumin + Copper + Alk_Phos + SGOT + Tryglicerides + Platelets + Prothrombin + Stage + thrombocytopenia + elevated_alk_phos + normal_copper + normal_albumin + Age_Group + Symptom_Score + Diag_Month 
##    Data: train_data (Number of observations: 194) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Population-Level Effects: 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept            -4.96      3.51   -11.62     1.78 1.00     4192     2029
## N_Days               -0.80      0.31    -1.43    -0.21 1.00     4634     3275
## Drug1                -0.22      0.23    -0.68     0.24 1.00     5841     2579
## Age                   0.22      0.52    -0.81     1.22 1.00     4300     2834
## Sex1                  0.74      0.76    -0.75     2.25 1.00     4891     3277
## Ascites1              0.90      1.55    -2.08     4.22 1.00     4365     2586
## Hepatomegaly1        -1.27      0.95    -3.24     0.53 1.00     3292     3110
## Spiders1              0.71      0.91    -1.06     2.49 1.00     3520     3060
## Edema1                5.94      3.09     1.80    13.58 1.00     3440     1846
## Bilirubin             1.39      0.66     0.15     2.70 1.00     4055     3115
## Cholesterol           0.05      0.29    -0.51     0.62 1.00     4770     2823
## Albumin               0.10      0.46    -0.82     0.97 1.00     3782     2858
## Copper               -0.13      0.38    -0.88     0.59 1.00     4352     3061
## Alk_Phos              1.03      0.33     0.43     1.72 1.00     4261     3306
## SGOT                  0.59      0.27     0.07     1.11 1.00     4784     3175
## Tryglicerides         0.47      0.32    -0.17     1.12 1.00     4856     3464
## Platelets             0.27      0.32    -0.35     0.90 1.00     3659     3115
## Prothrombin           0.85      0.29     0.31     1.44 1.00     4244     2846
## Stage2                2.34      1.09     0.34     4.66 1.00     3177     2304
## Stage3                1.64      1.06    -0.29     3.92 1.00     2779     2333
## Stage4                1.73      1.10    -0.36     4.05 1.00     3273     2164
## thrombocytopenia1     1.16      0.94    -0.67     2.99 1.00     4556     3166
## elevated_alk_phos     1.00      3.02    -5.00     6.81 1.00     4578     1552
## normal_copper1       -0.48      0.56    -1.59     0.63 1.00     4779     3194
## normal_albumin1       0.19      0.78    -1.37     1.71 1.00     3863     2813
## Age_Group1           -0.10      1.48    -3.07     2.78 1.00     3611     2627
## Age_Group2            2.42      1.35    -0.20     5.20 1.00     3322     2300
## Age_Group3            0.94      1.54    -2.12     4.02 1.00     3437     2695
## Symptom_Score1        1.65      0.92    -0.09     3.49 1.00     3205     2415
## Symptom_Score2        1.76      1.48    -1.04     4.76 1.00     2825     2732
## Symptom_Score3       -1.42      2.37    -6.76     2.62 1.00     3250     2278
## Diag_Month            0.84      0.30     0.26     1.47 1.00     5146     3391
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Compare the three models using the loo() function (approximate leave-one-out cross-validation).

# Approximate leave-one-out cross-validation comparison of the three fits.
# NOTE(review): the warnings below report pareto_k > 0.7 for 17/10/10
# observations, so the PSIS-LOO estimates are unreliable for those points;
# consider loo(..., moment_match = TRUE) (as the warnings suggest) or
# reloo() before trusting the elpd_diff ranking.
loo(Model1,Model2,Model3)
## Warning: Found 17 observations with a pareto_k > 0.7 in model 'Model1'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Warning: Found 10 observations with a pareto_k > 0.7 in model 'Model2'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Warning: Found 10 observations with a pareto_k > 0.7 in model 'Model3'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
## Output of model 'Model1':
## 
## Computed from 8000 by 194 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo    -97.7 11.5
## p_loo        34.5  5.2
## looic       195.4 22.9
## ------
## Monte Carlo SE of elpd_loo is NA.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     154   79.4%   1467      
##  (0.5, 0.7]   (ok)        23   11.9%   397       
##    (0.7, 1]   (bad)       17    8.8%   90        
##    (1, Inf)   (very bad)   0    0.0%   <NA>      
## See help('pareto-k-diagnostic') for details.
## 
## Output of model 'Model2':
## 
## Computed from 8000 by 194 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo    -93.5 10.4
## p_loo        32.7  4.5
## looic       186.9 20.7
## ------
## Monte Carlo SE of elpd_loo is NA.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     164   84.5%   913       
##  (0.5, 0.7]   (ok)        20   10.3%   330       
##    (0.7, 1]   (bad)        9    4.6%   99        
##    (1, Inf)   (very bad)   1    0.5%   8013      
## See help('pareto-k-diagnostic') for details.
## 
## Output of model 'Model3':
## 
## Computed from 4000 by 194 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo    -96.7 11.4
## p_loo        32.9  5.2
## looic       193.3 22.8
## ------
## Monte Carlo SE of elpd_loo is NA.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     163   84.0%   600       
##  (0.5, 0.7]   (ok)        21   10.8%   164       
##    (0.7, 1]   (bad)        7    3.6%   3351      
##    (1, Inf)   (very bad)   3    1.5%   38        
## See help('pareto-k-diagnostic') for details.
## 
## Model comparisons:
##        elpd_diff se_diff
## Model2  0.0       0.0   
## Model3 -3.2       3.0   
## Model1 -4.2       3.0

Model diagnostics (MCMC trace-plot evaluation)

# Trace plots of the posterior draws for each model: well-mixed chains
# should overlap and show no trends or sticking.
mcmc_plot(Model1, type = "trace")
## No divergences to plot.

mcmc_plot(Model2, type = "trace")
## No divergences to plot.

mcmc_plot(Model3, type = "trace")
## No divergences to plot.

Model evaluation on the test split (specificity and sensitivity, as a prior sensitivity analysis)

# Evaluate Model1 on the held-out test split.
# predict() on a brmsfit returns a matrix whose "Estimate" column is the
# posterior mean of P(Status = 1); thresholding the whole matrix and then
# reading $Estimate recovers the class labels.
# NOTE(review): `type = "response"` is not an argument of predict.brmsfit —
# presumably harmless here, but confirm it is silently ignored.
test_predictions <- predict(Model1, newdata = test_data, type = "response")
predicted_classes <- ifelse(test_predictions > 0.5, 1, 0)
predicted_classes_Model1 <- as.data.frame(predicted_classes)
# Rows of the table are PREDICTED classes, columns are OBSERVED classes.
# factor(..., levels = c(0, 1)) keeps the table 2x2 even if only one class
# is ever predicted, so the [2, 2] indexing below cannot fail.
conf_matrix <- table(factor(predicted_classes_Model1$Estimate, levels = c(0, 1)),
                     test_data$Status)

TP <- conf_matrix[2, 2]  # predicted 1, observed 1
FP <- conf_matrix[2, 1]  # predicted 1, observed 0 (bug fix: was [1, 2], the FN cell)
TN <- conf_matrix[1, 1]  # predicted 0, observed 0
FN <- conf_matrix[1, 2]  # predicted 0, observed 1 (bug fix: was [2, 1], the FP cell)

# With the corrected cells these are true specificity = TN / (TN + FP) and
# sensitivity = TP / (TP + FN); the original index swap made them compute
# NPV and precision instead, so the printed values will change.
specificity <- TN / (TN + FP)
sensitivity <- TP / (TP + FN)
# Evaluate Model2 on the held-out test split (same procedure as for Model1).
test_predictions_Model2 <- predict(Model2, newdata = test_data, type = "response")
predicted_classes_Model2 <- ifelse(test_predictions_Model2 > 0.5, 1, 0)
predicted_classes_Model2 <- as.data.frame(predicted_classes_Model2)
# Rows = predicted classes, columns = observed classes; the factor() guard
# keeps the table 2x2 even under degenerate predictions.
conf_matrix_Model2 <- table(factor(predicted_classes_Model2$Estimate, levels = c(0, 1)),
                            test_data$Status)

TP_Model2 <- conf_matrix_Model2[2, 2]  # predicted 1, observed 1
FP_Model2 <- conf_matrix_Model2[2, 1]  # predicted 1, observed 0 (bug fix: was [1, 2], the FN cell)
TN_Model2 <- conf_matrix_Model2[1, 1]  # predicted 0, observed 0
FN_Model2 <- conf_matrix_Model2[1, 2]  # predicted 0, observed 1 (bug fix: was [2, 1], the FP cell)

# Corrected definitions: specificity = TN / (TN + FP), sensitivity =
# TP / (TP + FN). The original swap reported NPV/precision under these names.
specificity_Model2 <- TN_Model2 / (TN_Model2 + FP_Model2)
sensitivity_Model2 <- TP_Model2 / (TP_Model2 + FN_Model2)

# Evaluate Model3 on the held-out test split (same procedure as for Model1).
test_predictions_Model3 <- predict(Model3, newdata = test_data, type = "response")
predicted_classes_Model3 <- ifelse(test_predictions_Model3 > 0.5, 1, 0)
predicted_classes_Model3 <- as.data.frame(predicted_classes_Model3)
# Rows = predicted classes, columns = observed classes; the factor() guard
# keeps the table 2x2 even under degenerate predictions.
conf_matrix_Model3 <- table(factor(predicted_classes_Model3$Estimate, levels = c(0, 1)),
                            test_data$Status)

TP_Model3 <- conf_matrix_Model3[2, 2]  # predicted 1, observed 1
FP_Model3 <- conf_matrix_Model3[2, 1]  # predicted 1, observed 0 (bug fix: was [1, 2], the FN cell)
TN_Model3 <- conf_matrix_Model3[1, 1]  # predicted 0, observed 0
FN_Model3 <- conf_matrix_Model3[1, 2]  # predicted 0, observed 1 (bug fix: was [2, 1], the FP cell)

# Corrected definitions: specificity = TN / (TN + FP), sensitivity =
# TP / (TP + FN). The original swap reported NPV/precision under these names.
specificity_Model3 <- TN_Model3 / (TN_Model3 + FP_Model3)
sensitivity_Model3 <- TP_Model3 / (TP_Model3 + FN_Model3)
# Side-by-side table of the test-split specificity and sensitivity obtained
# for each of the three fitted models.
comparison_results <- data.frame(
  Model       = c("Model1", "Model2", "Model3"),
  Specificity = c(specificity, specificity_Model2, specificity_Model3),
  Sensitivity = c(sensitivity, sensitivity_Model2, sensitivity_Model3)
)

print(comparison_results)
##    Model Specificity Sensitivity
## 1 Model1   0.7800000   0.6875000
## 2 Model2   0.7708333   0.6470588
## 3 Model3   0.7600000   0.6562500

Stan code generated by brms for each of the models

# Stan program brms generated for Model1: per-coefficient student_t(4, 1, 2)
# priors (normal(-0.5, 0.25) on b[2]) plus three random-intercept terms.
stancode(Model1)
## // generated with brms 2.20.4
## functions {
## }
## data {
##   int<lower=1> N;  // total number of observations
##   array[N] int Y;  // response variable
##   int<lower=1> K;  // number of population-level effects
##   matrix[N, K] X;  // population-level design matrix
##   int<lower=1> Kc;  // number of population-level effects after centering
##   // data for group-level effects of ID 1
##   int<lower=1> N_1;  // number of grouping levels
##   int<lower=1> M_1;  // number of coefficients per level
##   array[N] int<lower=1> J_1;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_1_1;
##   // data for group-level effects of ID 2
##   int<lower=1> N_2;  // number of grouping levels
##   int<lower=1> M_2;  // number of coefficients per level
##   array[N] int<lower=1> J_2;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_2_1;
##   // data for group-level effects of ID 3
##   int<lower=1> N_3;  // number of grouping levels
##   int<lower=1> M_3;  // number of coefficients per level
##   array[N] int<lower=1> J_3;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_3_1;
##   int prior_only;  // should the likelihood be ignored?
## }
## transformed data {
##   matrix[N, Kc] Xc;  // centered version of X without an intercept
##   vector[Kc] means_X;  // column means of X before centering
##   for (i in 2:K) {
##     means_X[i - 1] = mean(X[, i]);
##     Xc[, i - 1] = X[, i] - means_X[i - 1];
##   }
## }
## parameters {
##   vector[Kc] b;  // regression coefficients
##   real Intercept;  // temporary intercept for centered predictors
##   vector<lower=0>[M_1] sd_1;  // group-level standard deviations
##   array[M_1] vector[N_1] z_1;  // standardized group-level effects
##   vector<lower=0>[M_2] sd_2;  // group-level standard deviations
##   array[M_2] vector[N_2] z_2;  // standardized group-level effects
##   vector<lower=0>[M_3] sd_3;  // group-level standard deviations
##   array[M_3] vector[N_3] z_3;  // standardized group-level effects
## }
## transformed parameters {
##   vector[N_1] r_1_1;  // actual group-level effects
##   vector[N_2] r_2_1;  // actual group-level effects
##   vector[N_3] r_3_1;  // actual group-level effects
##   real lprior = 0;  // prior contributions to the log posterior
##   r_1_1 = (sd_1[1] * (z_1[1]));
##   r_2_1 = (sd_2[1] * (z_2[1]));
##   r_3_1 = (sd_3[1] * (z_3[1]));
##   lprior += student_t_lpdf(b[1] | 4, 1, 2);
##   lprior += normal_lpdf(b[2] | -0.5,0.25);
##   lprior += student_t_lpdf(b[3] | 4, 1, 2);
##   lprior += student_t_lpdf(b[4] | 4, 1, 2);
##   lprior += student_t_lpdf(b[5] | 4, 1, 2);
##   lprior += student_t_lpdf(b[6] | 4, 1, 2);
##   lprior += student_t_lpdf(b[7] | 4, 1, 2);
##   lprior += student_t_lpdf(b[8] | 4, 1, 2);
##   lprior += student_t_lpdf(b[9] | 4, 1, 2);
##   lprior += student_t_lpdf(b[10] | 4, 1, 2);
##   lprior += student_t_lpdf(b[11] | 4, 1, 2);
##   lprior += student_t_lpdf(b[12] | 4, 1, 2);
##   lprior += student_t_lpdf(b[13] | 4, 1, 2);
##   lprior += student_t_lpdf(b[14] | 4, 1, 2);
##   lprior += student_t_lpdf(b[15] | 4, 1, 2);
##   lprior += student_t_lpdf(b[16] | 4, 1, 2);
##   lprior += student_t_lpdf(b[17] | 4, 1, 2);
##   lprior += student_t_lpdf(b[18] | 4, 1, 2);
##   lprior += student_t_lpdf(b[19] | 4, 1, 2);
##   lprior += student_t_lpdf(b[20] | 4, 1, 2);
##   lprior += student_t_lpdf(b[21] | 4, 1, 2);
##   lprior += student_t_lpdf(b[22] | 4, 1, 2);
##   lprior += student_t_lpdf(b[23] | 4, 1, 2);
##   lprior += student_t_lpdf(b[24] | 4, 1, 2);
##   lprior += student_t_lpdf(b[25] | 4, 1, 2);
##   lprior += student_t_lpdf(b[26] | 4, 1, 2);
##   lprior += student_t_lpdf(b[27] | 4, 1, 2);
##   lprior += student_t_lpdf(b[28] | 4, 1, 2);
##   lprior += student_t_lpdf(b[29] | 4, 1, 2);
##   lprior += student_t_lpdf(b[30] | 4, 1, 2);
##   lprior += student_t_lpdf(b[31] | 4, 1, 2);
##   lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_3 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## }
## model {
##   // likelihood including constants
##   if (!prior_only) {
##     // initialize linear predictor term
##     vector[N] mu = rep_vector(0.0, N);
##     mu += Intercept;
##     for (n in 1:N) {
##       // add more terms to the linear predictor
##       mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_3_1[J_3[n]] * Z_3_1[n];
##     }
##     target += bernoulli_logit_glm_lpmf(Y | Xc, mu, b);
##   }
##   // priors including constants
##   target += lprior;
##   target += std_normal_lpdf(z_1[1]);
##   target += std_normal_lpdf(z_2[1]);
##   target += std_normal_lpdf(z_3[1]);
## }
## generated quantities {
##   // actual population-level intercept
##   real b_Intercept = Intercept - dot_product(means_X, b);
## }
# Stan program brms generated for Model2: a single student_t(3, 0, 2) prior
# on all coefficients plus the same three random-intercept terms.
stancode(Model2)
## // generated with brms 2.20.4
## functions {
## }
## data {
##   int<lower=1> N;  // total number of observations
##   array[N] int Y;  // response variable
##   int<lower=1> K;  // number of population-level effects
##   matrix[N, K] X;  // population-level design matrix
##   int<lower=1> Kc;  // number of population-level effects after centering
##   // data for group-level effects of ID 1
##   int<lower=1> N_1;  // number of grouping levels
##   int<lower=1> M_1;  // number of coefficients per level
##   array[N] int<lower=1> J_1;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_1_1;
##   // data for group-level effects of ID 2
##   int<lower=1> N_2;  // number of grouping levels
##   int<lower=1> M_2;  // number of coefficients per level
##   array[N] int<lower=1> J_2;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_2_1;
##   // data for group-level effects of ID 3
##   int<lower=1> N_3;  // number of grouping levels
##   int<lower=1> M_3;  // number of coefficients per level
##   array[N] int<lower=1> J_3;  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_3_1;
##   int prior_only;  // should the likelihood be ignored?
## }
## transformed data {
##   matrix[N, Kc] Xc;  // centered version of X without an intercept
##   vector[Kc] means_X;  // column means of X before centering
##   for (i in 2:K) {
##     means_X[i - 1] = mean(X[, i]);
##     Xc[, i - 1] = X[, i] - means_X[i - 1];
##   }
## }
## parameters {
##   vector[Kc] b;  // regression coefficients
##   real Intercept;  // temporary intercept for centered predictors
##   vector<lower=0>[M_1] sd_1;  // group-level standard deviations
##   array[M_1] vector[N_1] z_1;  // standardized group-level effects
##   vector<lower=0>[M_2] sd_2;  // group-level standard deviations
##   array[M_2] vector[N_2] z_2;  // standardized group-level effects
##   vector<lower=0>[M_3] sd_3;  // group-level standard deviations
##   array[M_3] vector[N_3] z_3;  // standardized group-level effects
## }
## transformed parameters {
##   vector[N_1] r_1_1;  // actual group-level effects
##   vector[N_2] r_2_1;  // actual group-level effects
##   vector[N_3] r_3_1;  // actual group-level effects
##   real lprior = 0;  // prior contributions to the log posterior
##   r_1_1 = (sd_1[1] * (z_1[1]));
##   r_2_1 = (sd_2[1] * (z_2[1]));
##   r_3_1 = (sd_3[1] * (z_3[1]));
##   lprior += student_t_lpdf(b | 3, 0, 2);
##   lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_1 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_2 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
##   lprior += student_t_lpdf(sd_3 | 3, 0, 2.5)
##     - 1 * student_t_lccdf(0 | 3, 0, 2.5);
## }
## model {
##   // likelihood including constants
##   if (!prior_only) {
##     // initialize linear predictor term
##     vector[N] mu = rep_vector(0.0, N);
##     mu += Intercept;
##     for (n in 1:N) {
##       // add more terms to the linear predictor
##       mu[n] += r_1_1[J_1[n]] * Z_1_1[n] + r_2_1[J_2[n]] * Z_2_1[n] + r_3_1[J_3[n]] * Z_3_1[n];
##     }
##     target += bernoulli_logit_glm_lpmf(Y | Xc, mu, b);
##   }
##   // priors including constants
##   target += lprior;
##   target += std_normal_lpdf(z_1[1]);
##   target += std_normal_lpdf(z_2[1]);
##   target += std_normal_lpdf(z_3[1]);
## }
## generated quantities {
##   // actual population-level intercept
##   real b_Intercept = Intercept - dot_product(means_X, b);
## }
# Stan program brms generated for Model3: same per-coefficient priors as
# Model1 but with no group-level (random-effect) structure.
stancode(Model3)
## // generated with brms 2.20.4
## functions {
## }
## data {
##   int<lower=1> N;  // total number of observations
##   array[N] int Y;  // response variable
##   int<lower=1> K;  // number of population-level effects
##   matrix[N, K] X;  // population-level design matrix
##   int<lower=1> Kc;  // number of population-level effects after centering
##   int prior_only;  // should the likelihood be ignored?
## }
## transformed data {
##   matrix[N, Kc] Xc;  // centered version of X without an intercept
##   vector[Kc] means_X;  // column means of X before centering
##   for (i in 2:K) {
##     means_X[i - 1] = mean(X[, i]);
##     Xc[, i - 1] = X[, i] - means_X[i - 1];
##   }
## }
## parameters {
##   vector[Kc] b;  // regression coefficients
##   real Intercept;  // temporary intercept for centered predictors
## }
## transformed parameters {
##   real lprior = 0;  // prior contributions to the log posterior
##   lprior += student_t_lpdf(b[1] | 4, 1, 2);
##   lprior += normal_lpdf(b[2] | -0.5, 0.25);
##   lprior += student_t_lpdf(b[3] | 4, 1, 2);
##   lprior += student_t_lpdf(b[4] | 4, 1, 2);
##   lprior += student_t_lpdf(b[5] | 4, 1, 2);
##   lprior += student_t_lpdf(b[6] | 4, 1, 2);
##   lprior += student_t_lpdf(b[7] | 4, 1, 2);
##   lprior += student_t_lpdf(b[8] | 4, 1, 2);
##   lprior += student_t_lpdf(b[9] | 4, 1, 2);
##   lprior += student_t_lpdf(b[10] | 4, 1, 2);
##   lprior += student_t_lpdf(b[11] | 4, 1, 2);
##   lprior += student_t_lpdf(b[12] | 4, 1, 2);
##   lprior += student_t_lpdf(b[13] | 4, 1, 2);
##   lprior += student_t_lpdf(b[14] | 4, 1, 2);
##   lprior += student_t_lpdf(b[15] | 4, 1, 2);
##   lprior += student_t_lpdf(b[16] | 4, 1, 2);
##   lprior += student_t_lpdf(b[17] | 4, 1, 2);
##   lprior += student_t_lpdf(b[18] | 4, 1, 2);
##   lprior += student_t_lpdf(b[19] | 4, 1, 2);
##   lprior += student_t_lpdf(b[20] | 4, 1, 2);
##   lprior += student_t_lpdf(b[21] | 4, 1, 2);
##   lprior += student_t_lpdf(b[22] | 4, 1, 2);
##   lprior += student_t_lpdf(b[23] | 4, 1, 2);
##   lprior += student_t_lpdf(b[24] | 4, 1, 2);
##   lprior += student_t_lpdf(b[25] | 4, 1, 2);
##   lprior += student_t_lpdf(b[26] | 4, 1, 2);
##   lprior += student_t_lpdf(b[27] | 4, 1, 2);
##   lprior += student_t_lpdf(b[28] | 4, 1, 2);
##   lprior += student_t_lpdf(b[29] | 4, 1, 2);
##   lprior += student_t_lpdf(b[30] | 4, 1, 2);
##   lprior += student_t_lpdf(b[31] | 4, 1, 2);
##   lprior += student_t_lpdf(Intercept | 3, 0, 2.5);
## }
## model {
##   // likelihood including constants
##   if (!prior_only) {
##     target += bernoulli_logit_glm_lpmf(Y | Xc, Intercept, b);
##   }
##   // priors including constants
##   target += lprior;
## }
## generated quantities {
##   // actual population-level intercept
##   real b_Intercept = Intercept - dot_product(means_X, b);
## }